We collect multiple variables for a particular purpose, knowing that social complexity can hardly be directly explained with bivariate or univariate approaches. As it is difficult to visualize information with high dimensional data; you should consider summarising all these variable into a composite index (latent), as I showed in session 2.
However, if you have a good reason to keep the original data, let me show you some plots you may use.
This time, I will use the data about city safety:
library(openxlsx)
link="https://github.com/EvansDataScience/data/raw/master/safeCitiesIndexAll.xlsx"
safe=read.xlsx(link)
The variables available are:
names(safe)
## [1] "city" "D_In_PrivacyPolicy"
## [3] "D_In_AwarenessDigitalThreats" "D_In_PubPrivPartnerships"
## [5] "D_In_TechnologyEmployed" "D_In_CyberSecurity"
## [7] "D_Out_IdentityTheft" "D_Out_CompInfected"
## [9] "D_Out_InternetAccess" "H_In_EnvironmentPolicies"
## [11] "H_In_AccessHealthcare" "H_In_Beds_1000"
## [13] "H_In_Doctors_1000" "H_In_AccessFood"
## [15] "H_In_QualityHealthServ" "H_Out_AirQuality"
## [17] "H_Out_WaterQuality" "H_Out_LifeExpectY"
## [19] "H_Out_InfMortality" "H_Out_CancerMortality"
## [21] "H_Out_AttacksBioChemRad" "I_In_EnforceTransportSafety"
## [23] "I_In_PedestrianFriendliness" "I_In_QualityRoad"
## [25] "I_In_QualityElectricity" "I_In_DisasterManagement"
## [27] "I_Out_DeathsDisaster" "I_Out_VehicularAccidents"
## [29] "I_Out_PedestrianDeath" "I_Out_LiveSlums"
## [31] "I_Out_AttacksInfrastructure" "P_In_PoliceEngage"
## [33] "P_In_CommunityPatrol" "P_In_StreetCrimeData"
## [35] "P_In_TechForCrime" "P_In_PrivateSecurity"
## [37] "P_In_GunRegulation" "P_In_PoliticalStability"
## [39] "P_Out_PettyCrime" "P_Out_ViolentCrime"
## [41] "P_Out_OrganisedCrime" "P_Out_Corruption"
## [43] "P_Out_DrugUse" "P_Out_TerroristAttacks"
## [45] "P_Out_SeverityTerrorist" "P_Out_GenderSafety"
## [47] "P_Out_PerceptionSafety" "P_Out_ThreaTerrorism"
## [49] "P_Out_ThreatMilitaryConf" "P_Out_ThreatCivUnrest"
These several variables are telling us information about the safety levels of some cities in the world, and are related to Digital, Health, Infrastructure, and Personal dimensions. For each of these dimensions, there are measures of actions taken (In), and results (Out). We have 49 variables.
Would making a plot of 49 variables be a good idea?
Let’s make a correlation matrix:
cormat <- round(cor(safe[,-1]),2)
The correlation matrix can be plotted if we turn it into a long data frame:
library(reshape2)
long_cormat <- melt(cormat)
head(long_cormat)
## Var1 Var2 value
## 1 D_In_PrivacyPolicy D_In_PrivacyPolicy 1.00
## 2 D_In_AwarenessDigitalThreats D_In_PrivacyPolicy 0.60
## 3 D_In_PubPrivPartnerships D_In_PrivacyPolicy 0.53
## 4 D_In_TechnologyEmployed D_In_PrivacyPolicy 0.76
## 5 D_In_CyberSecurity D_In_PrivacyPolicy 0.37
## 6 D_Out_IdentityTheft D_In_PrivacyPolicy -0.11
library(ggplot2)
ggplot(data = long_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()
get_upper_tri <- function(matrix){
matrix[lower.tri(matrix)]= NA
return(matrix)
}
# then
upper_tri <- get_upper_tri(cormat)
upper_tri_long=melt(upper_tri, na.rm = TRUE)
base = ggplot(data = upper_tri_long, aes(Var2, Var1)) + theme_classic()
corrs = base + geom_tile(aes(fill = value),color = "white")
corrs
corrs = corrs + scale_fill_gradient2(low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1),
space = "Lab",
name="Pearson\nCorrelation")
corrs
corrs = corrs + theme(axis.text.x = element_text(angle = 45,
vjust = 1,
size = 12,
hjust = 1))
corrs
There is GGally a package that help ggplots make this work easier:
library(GGally) # may need to install
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggcorr(safe[,-1], # Data without melting
hjust = 0.9,# distance to plot (diagonal)
size=1, # font size
layout.exp=4, # width so that variable names are shown
low = 'red',high = 'blue') # color scale
When using ggcorr you can work as in ggplot, adding layers:
base= ggcorr(safe[,-1],size=1,layout.exp=4,hjust=0.9,
nbreaks = 3, # 3 intervals
palette = "PuOr")
base + guides(fill=guide_legend("some title")) # if you need a title for legend
However, this tells you information about the variables (columns), but not about the cases (rows). You may try the heatmap to see cases and variables.
Heatmaps will show you the whole data set. First, we need some reshaping:
safeA=melt(safe, # all the data
id.vars = 'city') # BUT state the unit of analysis (unique)
head(safeA)
## city variable value
## 1 Abu Dhabi D_In_PrivacyPolicy 50
## 2 Amsterdam D_In_PrivacyPolicy 100
## 3 Athens D_In_PrivacyPolicy 75
## 4 Bangkok D_In_PrivacyPolicy 25
## 5 Barcelona D_In_PrivacyPolicy 100
## 6 Beijing D_In_PrivacyPolicy 75
Again, the melting changed the direction of the data: the columns were sent into rows. This looks like panel data format or long format (the original is the wide format). Now, the heatmap using this format:
base = ggplot(data = safeA, aes(x = variable,
y =city))
heat1= base + geom_tile(aes(fill = value))
heat1
Here you can see what rows have higher or lower colors on what set of variables. You can add color pallette:
#inverse color -1
heat2 = heat1 + scale_fill_gradient(low = 'grey',high = "black")
#scale_fill_distiller(palette = "RdYlGn",direction = 1,)
heat2
The column and row names need some work:
heat2 + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
size = 4),
axis.text.y = element_text(size = 4))
The last heatmap above could be ‘ordered’ so that column and row positions can give us more information:
# change in REORDER
base= ggplot(data = safeA, aes(x = reorder(variable,
value, median, order=TRUE),
y =reorder(city,
value, median, order=TRUE)))
# THIS IS THE SAME
base + geom_tile(aes(fill = value)) +
scale_fill_gradient(low = 'grey90',high = "grey50") +
theme(axis.text.x = element_text(angle = 90, hjust = 1,size = 4),
axis.text.y = element_text(size = 4))
This is still hard to read. An alternative could be to average each dimension:
library(magrittr)
safe$meanDIN=apply(safe[,c(grep("D_In_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanDOUT=apply(safe[,c(grep("D_Out_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanHIN=apply(safe[,c(grep("H_In_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanHOUT=apply(safe[,c(grep("H_Out_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanIIN=apply(safe[,c(grep("I_In_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanIOUT=apply(safe[,c(grep("I_Out_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanPIN=apply(safe[,c(grep("P_In_", colnames(safe) ))],1,mean)%>%round(2)
safe$meanPOUT=apply(safe[,c(grep("P_Out_", colnames(safe) ))],1,mean)%>%round(2)
Let’s keep all the Input variables:
safeINS=safe[,c(1,grep("IN$", colnames(safe)))] # end with
names(safeINS)=c("city",'HEALTH','DIGITAL','INFRASTR','PERSONAL')
head(safeINS)
## city HEALTH DIGITAL INFRASTR PERSONAL
## 1 Abu Dhabi 63.33 55.91 90.0 71.80
## 2 Amsterdam 80.00 78.95 94.0 92.61
## 3 Athens 60.00 64.93 72.0 64.54
## 4 Bangkok 28.33 55.28 60.5 61.61
## 5 Barcelona 80.00 73.86 96.5 90.95
## 6 Beijing 63.33 69.49 70.5 85.00
Just reshaping for ggplot:
safeINS_long=melt(safeINS,id.vars = 'city')
head(safeINS_long)
## city variable value
## 1 Abu Dhabi HEALTH 63.33
## 2 Amsterdam HEALTH 80.00
## 3 Athens HEALTH 60.00
## 4 Bangkok HEALTH 28.33
## 5 Barcelona HEALTH 80.00
## 6 Beijing HEALTH 63.33
We are using a radar plot this time:
base = ggplot(safeINS_long, aes(x = variable, y = value, group = city))
plot1 = base + geom_polygon(fill = 'gray',col='orange') + coord_polar()
plot2 = plot1 + facet_wrap(~city,# one plot per city
ncol = 10) # ten plot per row
plot2
The radar plot describes how a cases is doing in every dimension (we have four dimensions).
We could improve the plot by ordering the facet and increasing the font size ofthe name of dimensions (X), and having less columns:
plot2 = plot1 + facet_wrap(~reorder(city,value, median, order=TRUE),ncol = 7)
plot3 = plot2 + theme(axis.text.x = element_text(size = 8))
plot3
We can also highlight the case’s names, let’s change the theme from above:
plot3 = plot2 + theme(axis.text.x = element_text(size = 8),
legend.position="none",
strip.text = element_text(size = 20)) #here!!!
plot3
You could add extra customization if wanted:
### arguments
newBackGroundGrid=element_rect(fill = "white",
colour = "red",
size = 3,
linetype = "dashed")
newBackLineGrid=element_line(size = 3,
linetype = 'solid',
colour = "lightblue")
### more customization
plot3+ theme(panel.background = newBackGroundGrid,
panel.grid.major = newBackLineGrid)
The colors above are not the best choice, I just used them for you to notice where to make changes. Keep in mind that areas are difficult to compare, so the plots above might be used with care.
Finally, let me work with the health data for both input and output:
safeH=safe[,c(1,grep("meanH", colnames(safe) ))]
# flag to know when output was higher than input
safeH$upH=safeH$meanHOU>safeH$meanHIN
# check
head(safeH)
## city meanHIN meanHOUT upH
## 1 Abu Dhabi 55.91 81.17 TRUE
## 2 Amsterdam 78.95 80.64 TRUE
## 3 Athens 64.93 84.22 TRUE
## 4 Bangkok 55.28 78.00 TRUE
## 5 Barcelona 73.86 83.22 TRUE
## 6 Beijing 69.49 65.77 FALSE
Now plot:
library(ggplot2)
base=ggplot(data=safeH,
aes(x=meanHIN,y=meanHOUT))
io1= base + geom_point()
io1
Use the flag for color point:
io1= base + geom_point(aes(color=upH))
io1=io1 + ylim(c(0,100)) + xlim(c(0,100)) + coord_fixed(ratio = 1)
io1
Show some names:
library(ggrepel)
io1= io1 + geom_text_repel(aes(label=ifelse(upH,"",city)))
io1 = io1 + theme(legend.position = "none")
io1= io1 + labs(title = "title here")
io1
Let e reshape this safeH to long format
# create long format
safeHLong <- melt(safeH, id.vars = c('city','upH'))
Now take a look at the parallel plot:
base = ggplot(safeHLong, aes(x = variable, y = value, group = city))
base + geom_path()
Improve with color:
base + geom_path(aes(color=upH))
Send true cases to second plane:
base + geom_path(aes(color=upH,alpha=ifelse(upH,0.5,1)))
We add labels:
parallel= base + geom_path(aes(color=upH,alpha=ifelse(upH,0.5,1)))
parallel= parallel + geom_text_repel(aes(label=ifelse(upH,"",city)),
size=2)
parallel
Improving previous plot:
# no legend
parallel= parallel + theme(legend.position = "none")
# shrink empty areas
parallel= parallel + scale_x_discrete(expand = c(0.1,0))
parallel
Alternative to previous plot:
base2 = ggplot(safeHLong[!safeHLong$upH,],
aes(x = variable, y = value, group = city))
parallel1= base2 + geom_path(aes(color=city))
parallel1
Improving previous plot:
# labels
parallel1 = parallel1 + geom_text_repel(aes(label=ifelse(upH,"",city)),
size=4)
# no legend
parallel1= parallel1 + theme(legend.position = "none")
# shrink empty spaces
parallel1= parallel1 + scale_x_discrete(expand = c(0.1,0))
parallel1= parallel1 + labs(title = "title here")
parallel1
Let’s explore this difference:
summary(safeH$meanHOUT-safeH$meanHIN)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17.700 4.628 14.485 14.145 24.425 34.990
Let’s keep the difference above 20% and above 30%
safeH$jumpH20=abs(safeH$meanHOUT-safeH$meanHIN)>20
safeH$jumpH30=abs(safeH$meanHOUT-safeH$meanHIN)>30
Recreate long format:
# create long format
safeHLong <- melt(safeH, id.vars = c('city','upH','jumpH20','jumpH30'))
These are cities that do very well:
base3 = ggplot(safeHLong[safeHLong$jumpH20,],
aes(x = variable, y = value, group = city))
parallel2= base3 + geom_path()
parallel2= parallel2+ geom_text_repel(aes(label=ifelse(jumpH20,city,"")),
size=2)
parallel2 + theme(legend.position = "none")
You color the cities that went extraordinary:
base3 = ggplot(safeHLong[safeHLong$jumpH20,],
aes(x = variable, y = value, group = city))
parallel2= base3 + geom_path(aes(color=jumpH30)) #coloring
parallel2= parallel2+ geom_text_repel(aes(label=ifelse(jumpH20,city,"")),
size=2)
parallel2
Using facet to help the eye:
safeHLong$jumpH30=factor(safeHLong$jumpH30,
levels=c(TRUE,FALSE),
labels = c('above 30%','20%-30%'))
base3 = ggplot(safeHLong[safeHLong$jumpH20,],
aes(x = variable, y = value, group = city))
parallel2= base3 + geom_path(aes(color=jumpH30))
parallel2= parallel2+ geom_text_repel(aes(label=ifelse(jumpH20,city,"")),
size=4)
parallel2= parallel2 + scale_x_discrete(expand = c(0.1,0))
parallel2= parallel2 + theme(legend.position = "none")
parallel2= parallel2 + facet_grid(~jumpH30)
parallel2= parallel2 + labs(title = "title here")
parallel2
Combining the previous plots:
library(ggpubr)
# embedding ggarrange
row1Plot=parallel2
row2Plot=ggarrange(io1, parallel1,
ncol = 2)
ggarrange(row1Plot,row2Plot,
nrow = 2
)